Contents

1 Setup

library(EnrichmentBrowser)
library(GSEABenchmarkeR)
cb.pink <- "#CC79A7"
cb.darkred <- "#B42F32"
cb.red <- "#D55E00"
cb.lightred <- "#DF6747"
cb.blue <- "#0072B2"
cb.yellow <- "#F0E442"
cb.green <- "#009E73"
cb.lightblue <- "#56B4E9"
cb.lightorange <- "#FAAC77"
cb.orange <- "#E69F00"
cb.darkorange <- "#F6893D"
cb.lightgrey <- "#C9C9BD"
cb.darkgrey <- "#878D92"

2 Expression data sources

2.1 Microarray compendium

data.dir <- rappdirs::user_data_dir("GSEABenchmarkeR")
ma.dir <- file.path(data.dir, "GEO2KEGG_preproc")
geo2kegg <- loadEData(ma.dir)
## 
  |                                                                         
  |                                                                   |   0%
  |                                                                         
  |==                                                                 |   2%
  |                                                                         
  |===                                                                |   5%
  |                                                                         
  |=====                                                              |   7%
  |                                                                         
  |======                                                             |  10%
  |                                                                         
  |========                                                           |  12%
  |                                                                         
  |==========                                                         |  14%
  |                                                                         
  |===========                                                        |  17%
  |                                                                         
  |=============                                                      |  19%
  |                                                                         
  |==============                                                     |  21%
  |                                                                         
  |================                                                   |  24%
  |                                                                         
  |==================                                                 |  26%
  |                                                                         
  |===================                                                |  29%
  |                                                                         
  |=====================                                              |  31%
  |                                                                         
  |======================                                             |  33%
  |                                                                         
  |========================                                           |  36%
  |                                                                         
  |==========================                                         |  38%
  |                                                                         
  |===========================                                        |  40%
  |                                                                         
  |=============================                                      |  43%
  |                                                                         
  |==============================                                     |  45%
  |                                                                         
  |================================                                   |  48%
  |                                                                         
  |==================================                                 |  50%
  |                                                                         
  |===================================                                |  52%
  |                                                                         
  |=====================================                              |  55%
  |                                                                         
  |======================================                             |  57%
  |                                                                         
  |========================================                           |  60%
  |                                                                         
  |=========================================                          |  62%
  |                                                                         
  |===========================================                        |  64%
  |                                                                         
  |=============================================                      |  67%
  |                                                                         
  |==============================================                     |  69%
  |                                                                         
  |================================================                   |  71%
  |                                                                         
  |=================================================                  |  74%
  |                                                                         
  |===================================================                |  76%
  |                                                                         
  |=====================================================              |  79%
  |                                                                         
  |======================================================             |  81%
  |                                                                         
  |========================================================           |  83%
  |                                                                         
  |=========================================================          |  86%
  |                                                                         
  |===========================================================        |  88%
  |                                                                         
  |=============================================================      |  90%
  |                                                                         
  |==============================================================     |  93%
  |                                                                         
  |================================================================   |  95%
  |                                                                         
  |=================================================================  |  98%
  |                                                                         
  |===================================================================| 100%
ma.ids <- names(geo2kegg)
ma.ids
##  [1] "GSE1145"            "GSE11906"           "GSE1297"           
##  [4] "GSE14762"           "GSE14924_CD4"       "GSE14924_CD8"      
##  [7] "GSE15471"           "GSE16515"           "GSE16759"          
## [10] "GSE18842"           "GSE19188"           "GSE19420"          
## [13] "GSE19728"           "GSE20153"           "GSE20164"          
## [16] "GSE20291"           "GSE21354"           "GSE22780"          
## [19] "GSE23878"           "GSE24739_G0"        "GSE24739_G1"       
## [22] "GSE30153"           "GSE32676"           "GSE3467"           
## [25] "GSE3585"            "GSE3678"            "GSE38666_epithelia"
## [28] "GSE38666_stroma"    "GSE4107"            "GSE4183"           
## [31] "GSE42057"           "GSE5281_EC"         "GSE5281_HIP"       
## [34] "GSE5281_VCX"        "GSE6956AA"          "GSE6956C"          
## [37] "GSE7305"            "GSE781"             "GSE8671"           
## [40] "GSE8762"            "GSE9348"            "GSE9476"

2.2 RNA-seq compendium

rseq.dir <- file.path(data.dir, "TCGA_preproc")
rseq.raw <- file.path(rseq.dir, "GSE62944_matched_limmavoom", "tcga_paired_voomlimma.rds")
rseq.vst <- file.path(rseq.dir, "GSE62944_matched_vst", "tcga_paired_vst_limma.rds")
rseq.tpm <- file.path(rseq.dir, "cTD_matched_tpm", "cTD_matched_tpm.rds")
tcga.raw <- readRDS(rseq.raw)
tcga.vst <- readRDS(rseq.vst)
tcga.tpm <- readRDS(rseq.tpm)
rseq.ids <- names(tcga.raw)
rseq.ids
##  [1] "BLCA" "BRCA" "COAD" "HNSC" "KICH" "KIRC" "KIRP" "LIHC" "LUAD" "LUSC"
## [11] "PRAD" "READ" "STAD" "THCA" "UCEC"

2.2.1 DE distribution

geo2kegg <- runDE(geo2kegg, padj.method="BH")
bh.padj <- function(se)
{
    rowData(se)$ADJ.PVAL <- p.adjust(rowData(se)$PVAL, method="BH")
    return(se)
}
tcga.raw <- lapply(tcga.raw, bh.padj)
tcga.vst <- lapply(tcga.vst, bh.padj)
plotDEDistribution(geo2kegg)

plotDEDistribution(tcga.raw)

plotDEDistribution(tcga.vst)

Comparison of DE results for different RNA-seq approaches: - raw read counts + voom/limma - vst counts + limma - log2 TPMs + limma

# VST
cors.vst <- vapply(rseq.ids, 
    function(id) 
        cor(rowData(tcga.raw[[id]])$FC, 
            rowData(tcga.vst[[id]])$FC), 
    numeric(1))
sort(round(cors.vst, digits=3))
##  LIHC  BLCA  STAD  KIRP  HNSC  UCEC  THCA  KICH  BRCA  PRAD  KIRC  READ 
## 0.964 0.971 0.975 0.979 0.982 0.985 0.987 0.990 0.991 0.991 0.992 0.992 
##  LUAD  LUSC  COAD 
## 0.993 0.993 0.995
corsp.vst <- vapply(rseq.ids, 
    function(id) 
        cor(-log(rowData(tcga.raw[[id]])$PVAL, base=10), 
            -log(rowData(tcga.vst[[id]])$PVAL, base=10)), 
    numeric(1))
sort(round(corsp.vst, digits=3))
##  KIRP  LIHC  BLCA  KICH  THCA  UCEC  STAD  HNSC  READ  KIRC  LUSC  BRCA 
## 0.964 0.970 0.976 0.976 0.979 0.980 0.982 0.983 0.984 0.985 0.986 0.987 
##  COAD  LUAD  PRAD 
## 0.989 0.989 0.992
# TPM
cors.tpm <- vapply(names(tcga.tpm), 
    function(id) 
        cor(rowData(tcga.tpm[[id]])$FC, 
            rowData(tcga.raw[[id]][names(tcga.tpm[[id]]),])$FC), 
    numeric(1))
sort(round(cors.tpm, digits=3))
##  PRAD  STAD  BLCA  LUAD  THCA  KIRP  LIHC  BRCA  KICH  KIRC  LUSC  HNSC 
## 0.992 0.992 0.993 0.993 0.994 0.995 0.995 0.996 0.996 0.997 0.997 0.998
corsp.tpm <- vapply(names(tcga.tpm), 
    function(id) 
        cor(-log(rowData(tcga.tpm[[id]])$PVAL, base=10), 
            -log(rowData(tcga.raw[[id]][names(tcga.tpm[[id]]),])$PVAL, base=10)), 
    numeric(1))
sort(round(corsp.tpm, digits=3))
##  PRAD  LUAD  KIRP  STAD  BLCA  THCA  LIHC  BRCA  KICH  KIRC  HNSC  LUSC 
## 0.959 0.971 0.975 0.978 0.980 0.984 0.985 0.986 0.986 0.989 0.992 0.992
# log TPM
tcga.logtpm <- lapply(tcga.tpm, 
    function(se)
    { 
        assay(se) <- log(assay(se) + 1, base=2)
        return(se)
    })
tcga.logtpm <- runDE(tcga.logtpm)

cors.logtpm <- vapply(names(tcga.logtpm), 
    function(id) 
        cor(rowData(tcga.logtpm[[id]])$FC, 
            rowData(tcga.raw[[id]][names(tcga.logtpm[[id]]),])$FC), 
    numeric(1))
sort(round(cors.logtpm, digits=3))
##  LIHC  BLCA  KIRP  STAD  THCA  HNSC  KICH  LUAD  PRAD  BRCA  KIRC  LUSC 
## 0.956 0.966 0.967 0.967 0.980 0.982 0.983 0.985 0.985 0.987 0.988 0.989
corsp.logtpm <- vapply(names(tcga.logtpm), 
    function(id) 
        cor(-log(rowData(tcga.logtpm[[id]])$PVAL, base=10), 
            -log(rowData(tcga.raw[[id]][names(tcga.logtpm[[id]]),])$PVAL, base=10)), 
    numeric(1))
sort(round(corsp.logtpm, digits=3))
##  KICH  KIRP  THCA  PRAD  STAD  BRCA  BLCA  LUAD  KIRC  LIHC  LUSC  HNSC 
## 0.875 0.924 0.929 0.948 0.951 0.960 0.963 0.963 0.964 0.964 0.975 0.978
# overall
fc.mat <- data.frame(cors.vst, cors.tpm[rseq.ids], cors.logtpm[rseq.ids])
round(fc.mat, digits=3)
##      cors.vst cors.tpm.rseq.ids. cors.logtpm.rseq.ids.
## BLCA    0.971              0.993                 0.966
## BRCA    0.991              0.996                 0.987
## COAD    0.995                 NA                    NA
## HNSC    0.982              0.998                 0.982
## KICH    0.990              0.996                 0.983
## KIRC    0.992              0.997                 0.988
## KIRP    0.979              0.995                 0.967
## LIHC    0.964              0.995                 0.956
## LUAD    0.993              0.993                 0.985
## LUSC    0.993              0.997                 0.989
## PRAD    0.991              0.992                 0.985
## READ    0.992                 NA                    NA
## STAD    0.975              0.992                 0.967
## THCA    0.987              0.994                 0.980
## UCEC    0.985                 NA                    NA
p.mat <- data.frame(corsp.vst, corsp.tpm[rseq.ids], corsp.logtpm[rseq.ids])
round(p.mat, digits=3)
##      corsp.vst corsp.tpm.rseq.ids. corsp.logtpm.rseq.ids.
## BLCA     0.976               0.980                  0.963
## BRCA     0.987               0.986                  0.960
## COAD     0.989                  NA                     NA
## HNSC     0.983               0.992                  0.978
## KICH     0.976               0.986                  0.875
## KIRC     0.985               0.989                  0.964
## KIRP     0.964               0.975                  0.924
## LIHC     0.970               0.985                  0.964
## LUAD     0.989               0.971                  0.963
## LUSC     0.986               0.992                  0.975
## PRAD     0.992               0.959                  0.948
## READ     0.984                  NA                     NA
## STAD     0.982               0.978                  0.951
## THCA     0.979               0.984                  0.929
## UCEC     0.980                  NA                     NA

2.3 Golub

Frequently used data set, one of the first to use microarray data in a classification context. Golub et al. (1999)

library(golubEsets)
library(vsn)

data(Golub_Train)
exprs(Golub_Train) <- exprs(vsn2(Golub_Train))
## vsn2: 7129 x 38 matrix (1 stratum).
## Please use 'meanSdPlot' to verify the fit.
se <- probe2gene(Golub_Train)
## Loading required package: hu6800.db
## Loading required package: AnnotationDbi
## Loading required package: org.Hs.eg.db
## 
## 
## Encountered 360 from.IDs with >1 corresponding to.ID
## (the first to.ID was chosen for each of them)
se$GROUP <- ifelse(se$ALL.AML == "ALL", 1, 0)
se <- deAna(se)
se
## class: SummarizedExperiment 
## dim: 5523 38 
## metadata(3): experimentData annotation protocolData
## assays(1): exprs
## rownames(5523): 6772 2597 ... 3397 7277
## rowData names(4): FC limma.STAT PVAL ADJ.PVAL
## colnames(38): 1 2 ... 32 33
## colData names(12): Samples ALL.AML ... Source GROUP

3 Gene sets

3.1 KEGG

kegg.gs <- getGenesets(org="hsa", db="kegg")
length(kegg.gs)
## [1] 330
(min.size <- EnrichmentBrowser::configEBrowser("GS.MIN.SIZE"))
## [1] 5
(max.size <- EnrichmentBrowser::configEBrowser("GS.MAX.SIZE"))
## [1] 500
ind <- (lengths(kegg.gs) >= min.size) & (lengths(kegg.gs) <= max.size)
kegg.gs <- kegg.gs[ind]
length(kegg.gs)
## [1] 324
MASS::truehist(lengths(kegg.gs), nbins=50) 

3.2 GO

go.gs <- getGenesets(org="hsa", db="go", go.onto="BP")
## 
length(go.gs)
## [1] 12078
ind <- (lengths(go.gs) >= min.size) & (lengths(go.gs) <= max.size)
go.gs <- go.gs[ind]
length(go.gs)
## [1] 4705
MASS::truehist(lengths(go.gs), nbins=50)

4 Benchmarking

res.dir <- file.path(data.dir, "results")
geo.dir <- file.path(res.dir, "GEO2KEGG")

4.1 Runtime

ea.methods <- sbeaMethods()[1:10]
kegg.dir <- file.path(geo.dir, "kegg/perm1k")
ma.kegg.rtimes <- readResults(kegg.dir, ma.ids, 
                                methods=ea.methods, type="runtime")
lengths(ma.kegg.rtimes)
##        ora       safe       gsea        gsa      padog globaltest      roast 
##         42         42         42         42         42         42         42 
##     camera       gsva      samgs 
##         42         42         42
ma.kegg.rtimes[1:2]
## $ora
##            GSE1145           GSE11906            GSE1297           GSE14762 
##              0.360              0.498              0.367              0.459 
##       GSE14924_CD4       GSE14924_CD8           GSE15471           GSE16515 
##              0.309              0.466              0.469              0.424 
##           GSE16759           GSE18842           GSE19188           GSE19420 
##              0.283              0.483              0.339              0.421 
##           GSE19728           GSE20153           GSE20164           GSE20291 
##              0.458              0.430              0.229              0.377 
##           GSE21354           GSE22780           GSE23878        GSE24739_G0 
##              0.286              0.433              0.445              0.271 
##        GSE24739_G1           GSE30153           GSE32676            GSE3467 
##              0.262              0.346              0.288              0.426 
##            GSE3585            GSE3678 GSE38666_epithelia    GSE38666_stroma 
##              0.353              0.283              0.464              0.433 
##            GSE4107            GSE4183           GSE42057         GSE5281_EC 
##              0.278              0.446              0.212              0.291 
##        GSE5281_HIP        GSE5281_VCX          GSE6956AA           GSE6956C 
##              0.441              0.356              0.225              0.222 
##            GSE7305             GSE781            GSE8671            GSE8762 
##              0.247              0.226              0.438              0.248 
##            GSE9348            GSE9476 
##              0.310              0.199 
## 
## $safe
##            GSE1145           GSE11906            GSE1297           GSE14762 
##             11.480             24.782              6.050             10.399 
##       GSE14924_CD4       GSE14924_CD8           GSE15471           GSE16515 
##              8.420              9.809             24.841             13.115 
##           GSE16759           GSE18842           GSE19188           GSE19420 
##              0.837             29.785             41.487             12.182 
##           GSE19728           GSE20153           GSE20164           GSE20291 
##             10.359              8.430              3.225             10.748 
##           GSE21354           GSE22780           GSE23878        GSE24739_G0 
##              7.380              8.942             12.836              3.503 
##        GSE24739_G1           GSE30153           GSE32676            GSE3467 
##              4.290              7.279             11.517              9.048 
##            GSE3585            GSE3678 GSE38666_epithelia    GSE38666_stroma 
##              3.787              7.010             11.406              7.631 
##            GSE4107            GSE4183           GSE42057         GSE5281_EC 
##              6.109             10.940             16.118              6.303 
##        GSE5281_HIP        GSE5281_VCX          GSE6956AA           GSE6956C 
##              7.072              9.420              1.077              5.570 
##            GSE7305             GSE781            GSE8671            GSE8762 
##              5.433              5.184             18.191              5.951 
##            GSE9348            GSE9476 
##             16.891             15.657
go.dir <- file.path(geo.dir, "go_bp")
ma.go.rtimes <- readResults(go.dir, ma.ids, 
                                methods=ea.methods, type="runtime")
lengths(ma.go.rtimes)
##        ora       safe       gsea        gsa      padog globaltest      roast 
##         42         42         42         42         42         42         42 
##     camera       gsva      samgs 
##         42         42         42
bpPlot(ma.kegg.rtimes, what="runtime")

bpPlot(ma.go.rtimes, what="runtime")

Using ggpubr / ggplot2:

library(ggpubr)
plotRuntime2 <- function(rtimes)
{
    x <- do.call(cbind, rtimes)
    df <- reshape2::melt(x)
    df$value <- log(df$value, base=10)
    df$Var2 <- substring(df$Var2, 1, 7)
    medians <- vapply(rtimes, median, numeric(1))
    o <- names(sort(medians))
    o <- substring(o, 1, 7)
    p <- ggboxplot(df, x = "Var2", y = "value", width = 0.8, 
        ylab="log10 runtime [sec]", xlab="", order=o, fill="Var2")
    p <- ggpar(p, x.text.angle=45, palette = "simpsons", legend="none") + 
        grids("y", linetype = "dashed")
    return(p)
}
p <- plotRuntime2(ma.kegg.rtimes) 
p + geom_label(x=10, y=0, label="1 sec", col="grey") + 
geom_label(x=1.5, y=1, label="10 sec", col="grey") + 
geom_label(x=1.5, y=2, label="1 min 40 sec", col="grey") 

p <- plotRuntime2(ma.go.rtimes)
p + geom_label(x=10, y=1, label="10 sec", col="grey") +
geom_label(x=1.5, y=2, label="1 min 40 sec", col="grey") + 
geom_label(x=1.5, y=3, label="15 min 40 sec", col="grey")

tcga.dir <- file.path(res.dir, "TCGA")
vst.dir <- file.path(tcga.dir, "GSE62944_matched_vst")
rseq.kegg.dir <- file.path(vst.dir, "kegg")
rseq.go.dir <- file.path(vst.dir, "gobp")
rseq.kegg.rtimes <- readResults(rseq.kegg.dir, rseq.ids, 
                                    methods=ea.methods, type="runtime")
rseq.go.rtimes <- readResults(rseq.go.dir, rseq.ids, 
                                    methods=ea.methods, type="runtime")
bpPlot(rseq.kegg.rtimes, what="runtime")

bpPlot(rseq.go.rtimes, what="runtime")

facetplot <- function(ma.kegg, ma.go, rseq.kegg, rseq.go, 
    ylab="% significant sets", vline=6.5, log=FALSE)
{
    l <- list(ma.kegg=ma.kegg, ma.go=ma.go, rseq.kegg=rseq.kegg, rseq.go=rseq.go)
    df <- reshape2::melt(l)
    gsc <- vapply(df$L1, function(x) unlist(strsplit(x,"\\."))[2], 
                    character(1), USE.NAMES=FALSE)
    df <- cbind(df, gsc=gsc)
    df$gsc <- toupper(df$gsc)
    df$gsc <- vapply(df$gsc, function(n) 
                ifelse(n == "GO", paste(n, "BP", sep="-"), n), 
                character(1), USE.NAMES=FALSE)
    df$gsc <- factor(df$gsc, levels=c("KEGG", "GO-BP"))
    colnames(df)[1:2] <- c("dataset", "method")
    colnames(df)[4] <- "compendium"
    df$compendium <- sub("ma.kegg", "GEO2KEGG microarray", df$compendium)
    df$compendium <- sub("rseq.go", "TCGA RNA-seq", df$compendium)
    df$compendium <- sub("rseq.kegg", "TCGA RNA-seq", df$compendium)
    df$compendium <- sub("ma.go", "GEO2KEGG microarray", df$compendium)
    df$method <- substring(df$method, 1, 7)
    if(log) df$value <- log(df$value, base=10)
    o <- sort(vapply(split(df$value, df$method), 
                        median, numeric(1), na.rm=TRUE))
    df$method <- factor(df$method, levels=names(o))
    p <- ggboxplot(df, x = "method", y = "value", 
        width = 0.8, ylab=ylab, xlab="", fill="method")
    p <- ggpar(p, x.text.angle=45, palette = "simpsons", legend="none") 
    if(!is.na(vline)) 
        p <- p + geom_vline(xintercept=vline, linetype="dashed", color = cb.darkgrey)
    p <- facet(p, facet.by=c("compendium", "gsc"))     
    return(p)
}    
p <- facetplot(do.call(cbind, ma.kegg.rtimes), 
            do.call(cbind, ma.go.rtimes), 
            do.call(cbind, rseq.kegg.rtimes), 
            do.call(cbind, rseq.go.rtimes),
            ylab="log10 runtime [sec]",
            vline=NA, log=TRUE)
p + grids("y", linetype = "dashed") +
geom_label(x=9, y=0, label="1 sec", col="grey", size=2.5) +
geom_label(x=9, y=1, label="10 sec", col="grey", size=2.5) +
geom_label(x=2, y=2, label="1 min 40 sec", col="grey", size=2.5) +
geom_label(x=2, y=3, label="15 min 40 sec", col="grey", size=2.5)

Checking different RNA-seq modes (raw vs. vst):

raw.dir <- file.path(tcga.dir, "GSE62944_matched_limmavoom")
raw.kegg.dir <- file.path(raw.dir, "kegg")
raw.kegg.rtimes <- readResults(raw.kegg.dir, rseq.ids, 
                                methods=list.files(raw.kegg.dir), type="runtime")
rtimes <- c(raw.kegg.rtimes[c("camera", "gsva", "roast")], 
            rseq.kegg.rtimes[c(c("camera", "gsva", "roast"))])
names(rtimes) <- paste(names(rtimes), c(rep("raw",3), rep("vst", 3)), sep=".")
rtimes <- rtimes[c(4,1,6,3,5,2)]
rtimes <- lapply(rtimes, log, base = 10)
ylab <- "log10 runtime [sec]"
par(las = 2)
boxplot(rtimes, col = rep(rainbow(3), each=2), ylab = ylab)

safe.kegg.rtimes <- list(vst=rseq.kegg.rtimes$safe, raw=raw.kegg.rtimes$safe)
bpPlot(safe.kegg.rtimes, what="runtime")

4.1.1 Conclusion

Median runtime:

  • KEGG: min: camera (0.25 sec); max: gsea (1.43 min)
  • GO: min: camera(7.7 sec); max: gsea (35.1 min)

Three groups:

  1. simple tests: camera, globaltest, ora, mgsa
  2. light permutation: gsva, safe, roast, ebm, samgs
  3. heavy permutation: gsa, padog, gsea

4.2 Statistical significance

ma.kegg.ranks <- readResults(kegg.dir, ma.ids, 
                                methods=ea.methods, type="ranking")
ma.go.ranks <- readResults(go.dir, ma.ids,
                                methods=ea.methods, type="ranking")
lengths(ma.kegg.ranks)
##        ora       safe       gsea        gsa      padog globaltest      roast 
##         42         42         42         42         42         42         42 
##     camera       gsva      samgs 
##         42         42         42
ma.kegg.ranks$ora[1:2]
## $GSE1145
## DataFrame with 292 rows and 4 columns
##                                               GENE.SET  NR.GENES
##                                            <character> <numeric>
## 1                         hsa05010_Alzheimer's_disease       164
## 2                             hsa03018_RNA_degradation        76
## 3                        hsa05016_Huntington's_disease       187
## 4              hsa04071_Sphingolipid_signaling_pathway       118
## 5   hsa05142_Chagas_disease_(American_trypanosomiasis)       102
## ...                                                ...       ...
## 288                          hsa02010_ABC_transporters        43
## 289                        hsa05033_Nicotine_addiction        40
## 290   hsa04080_Neuroactive_ligand-receptor_interaction       274
## 291        hsa00430_Taurine_and_hypotaurine_metabolism        11
## 292       hsa00524_Butirosin_and_neomycin_biosynthesis         5
##     NR.SIG.GENES      PVAL
##        <numeric> <numeric>
## 1             57  0.000754
## 2             30   0.00146
## 3             62   0.00176
## 4             41   0.00397
## 5             36   0.00503
## ...          ...       ...
## 288            4     0.996
## 289            3     0.998
## 290           43         1
## 291            0         1
## 292            0         1
## 
## $GSE11906
## DataFrame with 292 rows and 4 columns
##                                                         GENE.SET  NR.GENES
##                                                      <character> <numeric>
## 1                            hsa04380_Osteoclast_differentiation       130
## 2                       hsa00051_Fructose_and_mannose_metabolism        32
## 3                                 hsa04668_TNF_signaling_pathway       110
## 4                                hsa00565_Ether_lipid_metabolism        42
## 5                           hsa04917_Prolactin_signaling_pathway        72
## ...                                                          ...       ...
## 288                          hsa00460_Cyanoamino_acid_metabolism         7
## 289                               hsa00750_Vitamin_B6_metabolism         6
## 290                                 hsa00232_Caffeine_metabolism         5
## 291 hsa00400_Phenylalanine,_tyrosine_and_tryptophan_biosynthesis         5
## 292                 hsa00524_Butirosin_and_neomycin_biosynthesis         5
##     NR.SIG.GENES      PVAL
##        <numeric> <numeric>
## 1              8   0.00264
## 2              4   0.00266
## 3              7   0.00404
## 4              4   0.00718
## 5              5    0.0103
## ...          ...       ...
## 288            0         1
## 289            0         1
## 290            0         1
## 291            0         1
## 292            0         1

4.2.1 Nominal p-values

ma.kegg.sig.sets <- evalNrSigSets(ma.kegg.ranks, alpha=0.05, padj="none")
bpPlot(ma.kegg.sig.sets, what="sig.sets")

ma.go.sig.sets <- evalNrSigSets(ma.go.ranks, alpha=0.05, padj="none")
bpPlot(ma.go.sig.sets, what="sig.sets")

rseq.kegg.ranks <- readResults(rseq.kegg.dir, rseq.ids, 
                                methods=ea.methods, type="ranking")
rseq.kegg.sig.sets <- evalNrSigSets(rseq.kegg.ranks, alpha=0.05, padj="none")
rseq.go.ranks <- readResults(rseq.go.dir, rseq.ids, 
                                methods=ea.methods, type="ranking")
rseq.go.sig.sets <- evalNrSigSets(rseq.go.ranks, alpha=0.05, padj="none")

facetplot(ma.kegg.sig.sets, ma.go.sig.sets, rseq.kegg.sig.sets, rseq.go.sig.sets)
## Warning: Removed 4 rows containing non-finite values (stat_boxplot).

4.2.2 Random gene sets of increasing size using true sample labels

golub.dir <- file.path(res.dir, "golub")
random.dir <- file.path(golub.dir, "randomGS", "true_labels")
gs.grid <- c(5, 10, 25, 50, 100, 250, 500)
gs.grid <- paste0("gs", gs.grid)
ea.random <- readResults(random.dir, gs.grid, 
                            methods=ea.methods, type="ranking")
sig.sets <- evalNrSigSets(ea.random, alpha=0.05, padj="none")
ind <- order( apply(sig.sets, 2, median) )
sig.sets <- sig.sets[, ind]
round(sig.sets, digits=1)
##       ora camera safe padog gsea gsa gsva roast globaltest samgs
## gs5   5.0      5  5.0     5  7.0 7.0 21.0  24.0         43    54
## gs10  2.0      5  2.0     4  8.0 7.0 26.0  32.0         63    78
## gs25  3.0      3  4.0     5  8.0 3.0 25.0  24.0         94    98
## gs50  3.0      5  5.0     5 10.0 9.0 29.0  33.0         98   100
## gs100 5.0      1  7.0     3  5.0 3.0 23.0  18.0        100   100
## gs250 1.0      0  8.0     6  3.0 7.0 23.0  14.0        100   100
## gs500 3.8      0  7.7     2  1.9 3.6 17.3  11.5        100   100
GSEABenchmarkeR:::plotGSSizeRobustness(sig.sets, what="sig.sets")

df <- reshape2::melt(sig.sets)
colnames(df) <- c("Size", "Method", "value")
df$Method <- substring(df$Method, 1, 7)
df$Method <- factor(df$Method, levels=substring(rev(colnames(sig.sets)),1,7))
df$Size <- sub("^gs", "", df$Size)
col <- rev(get_palette(palette = "simpsons", 10))
p <- ggline(df, x = "Size", y = "value", linetype = "Method", color="Method",
    palette = "simpsons", ylab="% significant sets", xlab="gene set size") +
geom_hline(yintercept=5, linetype="dashed", color = cb.darkgrey) 
p

4.2.3 Adjusted p-values

ma.kegg.sig.sets <- evalNrSigSets(ma.kegg.ranks, alpha=0.05, padj="BH")
bpPlot(ma.kegg.sig.sets, what="sig.sets")

ma.go.sig.sets <- evalNrSigSets(ma.go.ranks, alpha=0.05, padj="BH")
bpPlot(ma.go.sig.sets, what="sig.sets")

rseq.kegg.sig.sets <- evalNrSigSets(rseq.kegg.ranks, alpha=0.05, padj="BH")
rseq.go.sig.sets <- evalNrSigSets(rseq.go.ranks, alpha=0.05, padj="BH")
facetplot(ma.kegg.sig.sets, ma.go.sig.sets, rseq.kegg.sig.sets, rseq.go.sig.sets)
## Warning: Removed 4 rows containing non-finite values (stat_boxplot).

4.2.4 Use built-in FDR-control for GSEA and SAFE

fdr.dir <- file.path(kegg.dir, "builtInFDR")
fdr.methods <- c("gsea", "safe") 
ea.kegg.ranks.fdr <- readResults(fdr.dir, ma.ids, 
                                    methods=fdr.methods, type="ranking")
sig.sets <- evalNrSigSets(ea.kegg.ranks.fdr)
bpPlot(sig.sets, what="sig.sets")

4.2.5 Number of permutations (1k -> 10k)

kegg.dir <- sub("1k$", "10k", kegg.dir)
perm.methods <- c("gsa", "gsea", "padog", "roast", "safe", "samgs")
ea.kegg.ranks.10k <- readResults(kegg.dir, ma.ids, 
                                    methods=perm.methods, type="ranking")
sig.sets <- evalNrSigSets(ea.kegg.ranks.10k, alpha=0.05, padj="BH")
bpPlot(sig.sets, what="sig.sets")

fdr.dir <- file.path(kegg.dir, "builtInFDR")
ea.kegg.ranks.fdr.10k <- readResults(fdr.dir, ma.ids, 
                                        methods=fdr.methods, type="ranking")
sig.sets <- evalNrSigSets(ea.kegg.ranks.fdr.10k)
bpPlot(sig.sets, what="sig.sets")

4.2.6 Correlation with DE

ma.de <- vapply(geo2kegg, function(se) GSEABenchmarkeR:::.fractDE(se), numeric(2))
ma.de <- ma.de["p",]
rseq.de <- vapply(tcga.raw, function(se) GSEABenchmarkeR:::.fractDE(se), numeric(2))
rseq.de <- rseq.de["p",]
plotCorDE <- function(de, sig.sets)
{
    cors <- vapply(ea.methods, 
                function(m) cor(de, sig.sets[,m], use="complete.obs"), 
                numeric(1))
    o <- order(cors, decreasing=TRUE)
    df <- reshape2::melt(sig.sets)
    colnames(df) <- c("Dataset", "Method", "value")
    df$xvalue <- de[as.vector(df$Dataset)] 
    df$Method <- substring(df$Method, 1, 7)
    df$Method <- factor(df$Method, 
        levels=substring(colnames(sig.sets)[o],1,7))
    col <- rev(get_palette(palette = "simpsons", 10))
    ggline(df, x = "xvalue", y = "value", numeric.x.axis = TRUE, 
        linetype = "Method", color="Method",
        palette = "simpsons", ylab="% significant sets", xlab="% DE genes")
}
plotCorDE(ma.de, ma.go.sig.sets)
## Warning in cor(de, sig.sets[, m], use = "complete.obs"): the standard
## deviation is zero
## Warning: Removed 2 rows containing missing values (geom_point).

cor.facetplot <- function(de, kegg.sig.sets, go.sig.sets)
{
    cors <- vapply(ea.methods, 
                function(m) cor(de, kegg.sig.sets[,m], use="complete.obs"), 
                numeric(1))
    o <- order(cors, decreasing=TRUE)

    l <- list(KEGG=kegg.sig.sets, GO=go.sig.sets)
    df <- reshape2::melt(l)
    colnames(df)[c(1:2,4)] <- c("dataset", "method", "gsc")
    df$xvalue <- de[as.vector(df$dataset)] 
    df$method <- substring(df$method, 1, 7)
    df$method <- factor(df$method, 
        levels=substring(colnames(kegg.sig.sets)[o],1,7))
    col <- rev(get_palette(palette = "simpsons", 10))
    p <- ggline(df, x = "xvalue", y = "value", numeric.x.axis = TRUE, 
        linetype = "method", color="method",
        palette = "simpsons", ylab="% significant sets", xlab="% DE genes")
    p <- facet(p, facet.by="gsc")     
    return(p)
}
cor.facetplot(ma.de, ma.kegg.sig.sets, ma.go.sig.sets)
## Warning in cor(de, kegg.sig.sets[, m], use = "complete.obs"): the standard
## deviation is zero
## Warning: Removed 4 rows containing missing values (geom_point).

cor.facetplot(rseq.de, rseq.kegg.sig.sets, rseq.go.sig.sets)
## Warning in cor(de, kegg.sig.sets[, m], use = "complete.obs"): the standard
## deviation is zero

4.2.7 Type I error rate

kegg.dir <- file.path(golub.dir, "kegg")
go.dir <- file.path(golub.dir, "go")
tyI <- evalTypeIError("globaltest", se, gs=kegg.gs, alpha=0.05, nperm=1000)
golgt.file <- file.path(kegg.dir, "globaltest.rds")
tyI.globaltest <- readRDS(golgt.file)
tyI.globaltest
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000000 0.006515 0.016287 0.050866 0.056189 0.732899
tyI.samgs <- readRDS(sub("globaltest", "samgs", golgt.file))
tyI.samgs
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000000 0.003257 0.009772 0.050039 0.039088 0.833876

Using nominal DE p-values for ORA and EBM.

res <- evalTypeIError(ea.methods, se, gs=kegg.gs, save2file=TRUE)
readTyI <- function(res.dir)
{
    res.files <- file.path(res.dir, paste0(c(sbeaMethods()[1:10],"camera_igcNA"), ".rds"))
    res.files <- res.files[file.exists(res.files)]
    res <- sapply(res.files, readRDS)
    colnames(res) <- basename(colnames(res))
    colnames(res) <- sub(".rds$", "", colnames(res))
    return(res)
}
kegg.res <- readTyI(kegg.dir)
colnames(kegg.res)[11] <- sub("_igcNA$", "*", colnames(kegg.res)[11])
go.res <- readTyI(go.dir)
colnames(go.res)[11] <- sub("_igcNA$", "*", colnames(go.res)[11])
GSEABenchmarkeR:::plotTypeIError(kegg.res)

GSEABenchmarkeR:::plotTypeIError(go.res)

plotTypeIError2 <- function(data, ylabel=0.4)
{
    data <- t(data)
    rownames(data) <- substring(rownames(data), 1, 7)
    data <- data[order(data[,"Max."] - data[,"Mean"]),]
    df <- data.frame(methods=rownames(data), data)
    colnames(df)[2:7] <- c("y0", "y25", "y50", "mean", "y75", "y100") 
    df[,1] <- factor(df[,1], levels=df[,1])
    df.points <- data.frame(x=1:11, y=df$mean)
    p <- ggplot() + 
        geom_boxplot(data=df, width = 0.8,
            aes(x=df[,1], ymin=y0, lower=y25, middle=y50, upper=y75, ymax=y100), 
            stat="identity", fill="grey92") + theme_pubr()
    #get_palette(palette = "simpsons", 10)
    p <- ggpar(p, x.text.angle=45, legend="none") + xlab("") + ylab("type I error rate") +
    geom_point(data=df.points, aes(x=x, y=y), color=cb.blue) +
    geom_hline(yintercept=0.05, linetype="dashed", color = cb.red) + 
    geom_vline(xintercept=7.5, linetype="dashed", color = cb.darkgrey) +
    geom_label(aes(x=4, y=ylabel), label="competitive", color = cb.darkgrey) +
    geom_label(aes(x=9.5, y=ylabel), label="self-contained", color = cb.darkgrey) 
    return(p)
}
plotTypeIError2(kegg.res, ylabel=0.8) 

plotTypeIError2(go.res) 

4.2.8 Type I error rate: random gene sets of increasing size with shuffled sample labels

TODO: recompute for ORA and EBM

random.dir <- file.path(golub.dir, "randomGS", "shuffled_labels")
ea.random.typeI <- readResults(random.dir, gs.grid, 
                                methods=ea.methods, type="typeI")
ea.random.typeI
## $ora
##             gs5    gs10    gs25    gs50   gs100   gs250        gs500
## Min.    0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.0000000000
## 1st Qu. 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.0000000000
## Median  0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.0000000000
## Mean    0.00012 0.00017 0.00037 0.00084 0.00115 0.00245 0.0002142857
## 3rd Qu. 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.0000000000
## Max.    0.03000 0.04000 0.05000 0.06000 0.08000 0.09000 0.0952380952
## 
## $safe
##             gs5    gs10   gs25   gs50   gs100   gs250      gs500
## Min.    0.00000 0.00000 0.0000 0.0000 0.01000 0.00000 0.00000000
## 1st Qu. 0.04000 0.03000 0.0300 0.0400 0.03000 0.03000 0.01851852
## Median  0.05000 0.05000 0.0500 0.0500 0.05000 0.05000 0.03703704
## Mean    0.04961 0.04874 0.0489 0.0492 0.04834 0.04814 0.04840741
## 3rd Qu. 0.06000 0.06000 0.0600 0.0600 0.06000 0.06000 0.07407407
## Max.    0.12000 0.11000 0.1200 0.1200 0.15000 0.14000 0.16666667
## 
## $gsea
##             gs5    gs10    gs25    gs50   gs100 gs250      gs500
## Min.    0.00000 0.00000 0.00000 0.00000 0.00000 0.000 0.00000000
## 1st Qu. 0.04000 0.04000 0.03000 0.03000 0.03000 0.020 0.00000000
## Median  0.05000 0.05000 0.05000 0.05000 0.04000 0.040 0.01818182
## Mean    0.04956 0.05336 0.04772 0.04876 0.04812 0.053 0.04500000
## 3rd Qu. 0.06000 0.07000 0.06000 0.06000 0.06750 0.070 0.05454545
## Max.    0.11000 0.13000 0.11000 0.13000 0.14000 0.360 0.41818182
## 
## $gsa
##             gs5    gs10    gs25   gs50  gs100  gs250      gs500
## Min.    0.01000 0.01000 0.00000 0.0000 0.0000 0.0000 0.00000000
## 1st Qu. 0.04000 0.03750 0.03000 0.0400 0.0400 0.0400 0.02173913
## Median  0.05000 0.05000 0.05000 0.0500 0.0500 0.0500 0.04347826
## Mean    0.04995 0.04905 0.04815 0.0543 0.0496 0.0493 0.04826087
## 3rd Qu. 0.06000 0.06000 0.06000 0.0700 0.0600 0.0600 0.06521739
## Max.    0.12000 0.11000 0.11000 0.1200 0.1000 0.1200 0.15217391
## 
## $padog
##             gs5    gs10    gs25    gs50   gs100   gs250      gs500
## Min.    0.01000 0.00000 0.02000 0.01000 0.01000 0.01000 0.00000000
## 1st Qu. 0.04000 0.04000 0.04000 0.04000 0.04000 0.04000 0.03448276
## Median  0.05000 0.05000 0.05000 0.05000 0.05000 0.05000 0.05172414
## Mean    0.05102 0.05062 0.04896 0.05064 0.05004 0.04896 0.04922414
## 3rd Qu. 0.06000 0.06000 0.06000 0.06000 0.06000 0.06000 0.06896552
## Max.    0.11000 0.11000 0.09000 0.09000 0.09000 0.09000 0.10344828
## 
## $globaltest
##             gs5    gs10    gs25    gs50   gs100   gs250      gs500
## Min.    0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000000
## 1st Qu. 0.02000 0.02000 0.01000 0.00000 0.00000 0.00000 0.00000000
## Median  0.04000 0.04000 0.03000 0.02000 0.01000 0.00000 0.00000000
## Mean    0.04865 0.05001 0.05124 0.04479 0.05552 0.04443 0.05838889
## 3rd Qu. 0.07000 0.07000 0.06000 0.05000 0.04000 0.01000 0.00000000
## Max.    0.42000 0.56000 0.65000 0.67000 0.99000 1.00000 1.00000000
## 
## $roast
##             gs5    gs10    gs25    gs50   gs100   gs250      gs500
## Min.    0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000000
## 1st Qu. 0.03000 0.03000 0.03000 0.02000 0.02000 0.01000 0.00000000
## Median  0.04000 0.04000 0.04000 0.04000 0.04000 0.02000 0.00000000
## Mean    0.04885 0.04826 0.04952 0.04811 0.04984 0.04779 0.05170370
## 3rd Qu. 0.06000 0.06000 0.06000 0.06000 0.06000 0.05000 0.03703704
## Max.    0.26000 0.20000 0.32000 0.32000 0.33000 0.63000 0.83333333
## 
## $camera
##             gs5    gs10    gs25    gs50   gs100   gs250 gs500
## Min.    0.00000 0.00000 0.00000 0.00000 0.00000 0.00000     0
## 1st Qu. 0.03000 0.03000 0.02000 0.01000 0.00000 0.00000     0
## Median  0.04000 0.04000 0.03000 0.01000 0.00000 0.00000     0
## Mean    0.04444 0.04522 0.02775 0.01617 0.00576 0.00023     0
## 3rd Qu. 0.06000 0.06000 0.04000 0.02000 0.01000 0.00000     0
## Max.    0.12000 0.11000 0.11000 0.07000 0.04000 0.01000     0
## 
## $gsva
##             gs5    gs10    gs25    gs50   gs100   gs250      gs500
## Min.    0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000000
## 1st Qu. 0.03000 0.03000 0.03000 0.03000 0.03000 0.03000 0.01851852
## Median  0.05000 0.05000 0.05000 0.05000 0.05000 0.05000 0.03703704
## Mean    0.05061 0.05017 0.05093 0.04956 0.05103 0.04923 0.05075926
## 3rd Qu. 0.07000 0.07000 0.07000 0.07000 0.07000 0.06000 0.07407407
## Max.    0.26000 0.23000 0.19000 0.16000 0.20000 0.18000 0.24074074
## 
## $samgs
##             gs5    gs10    gs25   gs50  gs100   gs250      gs500
## Min.    0.00000 0.00000 0.00000 0.0000 0.0000 0.00000 0.00000000
## 1st Qu. 0.02000 0.01000 0.00000 0.0000 0.0000 0.00000 0.00000000
## Median  0.04000 0.03000 0.02000 0.0000 0.0000 0.00000 0.00000000
## Mean    0.04897 0.05033 0.05198 0.0422 0.0557 0.04307 0.05575926
## 3rd Qu. 0.06000 0.06000 0.05000 0.0300 0.0100 0.00000 0.00000000
## Max.    0.52000 0.77000 0.92000 0.8600 1.0000 1.00000 1.00000000
mean.mat <- vapply(ea.random.typeI, function(x) x["Mean",], numeric(7))
med.mat <- vapply(ea.random.typeI, function(x) x["Median",], numeric(7))
GSEABenchmarkeR:::plotGSSizeRobustness(mean.mat, what="typeI")

GSEABenchmarkeR:::plotGSSizeRobustness(med.mat, what="typeI")

4.2.9 Conclusion

  • Nominal p-values: Three groups:
    1. 5-10% sig sets: camera, gsea, padog, gsa, safe, ora
    2. 25% sig sets: roast, gsva
    3. >90%: globaltest, samgs (show a clear gene set size dependency)
  • Adjusted p-values: flattens out signal for most methods, independent of gene set DB (GO/KEGG), number of permutations, and whether a built-in correction is used; recommendable for roast and gsva; does not make a difference for globaltest and samgs

  • Type I error rate:
    1. Inflated type I error rates for camera (real gene sets) and gsa (real and random gene sets)
    2. Many methods show deflated median type I error rates with increasing gene set size
    3. globaltest and samgs accurately control the type I error rate, yet their sensitive null hypothesis (self-contained) seems to render them of limited practical relevance

From the globaltest vignette (which is a top 5% Bioc package):

Still, it is true that the null hypotheses that the global test tests is false even if only a single gene in the gene set is associated with the phenotype; especially smaller gene sets may therefore become significant as a result of only a single significant gene.
However, because the test is directed especially against the alternative that there are many associated genes, such examples are rare among larger gene sets.

Rare = >90% ??

4.3 Phenotype relevance

4.3.1 MalaCards disease relevance rankings

data.dir <- system.file("extdata", package="GSEABenchmarkeR")
mala.kegg.file <- file.path(data.dir, "malacards", "KEGG.rds")
mala.go.file <- file.path(data.dir, "malacards", "GO_BP.rds")
mala.kegg <- readRDS(mala.kegg.file)
mala.go <- readRDS(mala.go.file)
vapply(mala.kegg, nrow, integer(1))
##  ACC  ALZ BLCA BRCA CESC CHOL  CML COAD  CRC  DCM DLBC DMND ESCA  GBM HNSC 
##    9   57   65  142   22   33   56   28  161   23   52   99   90   99   72 
## HUNT KICH KIRC KIRP LAML  LES  LGG LIHC LUAD LUSC MESO   OV PAAD PARK PCPG 
##   34    4    8    8  108   49   24   98   54   23    3   31   70   39   12 
## PDCO PRAD READ SARC SKCM STAD TGCT THCA THYM UCEC  UCS  UVM 
##   31   12    2   73   42   24   24   81   61   90   29   55
mala.kegg$ALZ
## DataFrame with 57 rows and 4 columns
##                                              TITLE REL.SCORE MATCHED.GENES
##                                        <character> <numeric>     <integer>
## hsa05010                        Alzheimers disease     84.12            28
## hsa04932 Non-alcoholic fatty liver disease (NAFLD)     84.12             7
## hsa04726                      Serotonergic synapse     49.19             8
## hsa04728                      Dopaminergic synapse     49.19             8
## hsa04713                     Circadian entrainment     49.19             5
## ...                                            ...       ...           ...
## hsa05310                                    Asthma      9.81             1
## hsa05416                         Viral myocarditis      9.81             2
## hsa05330                       Allograft rejection      9.81             1
## hsa05332                 Graft-versus-host disease      9.81             2
## hsa05321          Inflammatory bowel disease (IBD)      9.81             2
##          TOTAL.GENES
##            <integer>
## hsa05010         177
## hsa04932         160
## hsa04726         115
## hsa04728         130
## hsa04713          98
## ...              ...
## hsa05310          35
## hsa05416          64
## hsa05330          41
## hsa05332          45
## hsa05321          67
mala.kegg$BRCA
## DataFrame with 142 rows and 4 columns
##                            TITLE REL.SCORE MATCHED.GENES TOTAL.GENES
##                      <character> <numeric>     <integer>   <integer>
## hsa05210       Colorectal cancer     166.1            35          70
## hsa05213      Endometrial cancer     166.1            23          61
## hsa05221  Acute myeloid leukemia     166.1            16          60
## hsa05218                Melanoma     166.1            35          78
## hsa05215         Prostate cancer     166.1            40          95
## ...                          ...       ...           ...         ...
## hsa05020          Prion diseases     13.23             6          43
## hsa05144                 Malaria     11.34             6          55
## hsa05143 African trypanosomiasis     11.28             5          36
## hsa04720  Long-term potentiation     10.93             7          67
## hsa05134           Legionellosis     10.81             6          59

4.3.2 Mapping between dataset ID and disease code

d2d.file <- file.path(data.dir, "malacards", "GseId2Disease.txt")
d2d.map <- readDataId2diseaseCodeMap(d2d.file)
head(d2d.map)
##      GSE1145     GSE11906      GSE1297     GSE14762 GSE14924_CD4 
##        "DCM"       "PDCO"        "ALZ"       "KIRC"       "LAML" 
## GSE14924_CD8 
##       "LAML"
d2d.tcga <- rseq.ids
names(d2d.tcga) <- rseq.ids

4.3.3 Relevance score of a gene set ranking

ma.kegg.ranks$ora$GSE1297
## DataFrame with 290 rows and 4 columns
##                                               GENE.SET  NR.GENES
##                                            <character> <numeric>
## 1                   hsa00190_Oxidative_phosphorylation       106
## 2                         hsa05010_Alzheimer's_disease       148
## 3                         hsa05012_Parkinson's_disease       115
## 4                        hsa05016_Huntington's_disease       162
## 5   hsa04932_Non-alcoholic_fatty_liver_disease_(NAFLD)       135
## ...                                                ...       ...
## 286                               hsa03040_Spliceosome       112
## 287   hsa04914_Progesterone-mediated_oocyte_maturation        76
## 288                       hsa00232_Caffeine_metabolism         5
## 289                hsa00460_Cyanoamino_acid_metabolism         5
## 290       hsa00524_Butirosin_and_neomycin_biosynthesis         5
##     NR.SIG.GENES      PVAL
##        <numeric> <numeric>
## 1             55  1.98e-12
## 2             67   2.6e-11
## 3             56  3.43e-11
## 4             70  1.28e-10
## 5             48  7.08e-05
## ...          ...       ...
## 286           12     0.999
## 287            5         1
## 288            0         1
## 289            0         1
## 290            0         1
obs.score <- evalRelevance(ma.kegg.ranks$ora$GSE1297, mala.kegg$ALZ)
obs.score
## [1] 802.6036

4.3.4 Theoretical optimum

gs.names <- ma.kegg.ranks$ora$GSE1297$GENE.SET
gs.ids <- substring(gs.names, 1, 8)
opt.score <- compOpt(mala.kegg$ALZ, gs.ids)
opt.score
## [1] 1200.644
round(obs.score / opt.score * 100, digits=2)
## [1] 66.85

4.3.5 Random relevance score distribution

rand.scores <- compRand(mala.kegg$ALZ, gs.ids, perm=50)
summary(rand.scores)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   495.6   602.3   636.8   636.3   675.3   725.0
(sum(rand.scores >= obs.score) + 1) / 51
## [1] 0.01960784

4.3.6 Cross-dataset relevance score distribution

ma.kegg.rel.sets <- evalRelevance(ma.kegg.ranks, mala.kegg, d2d.map)
ma.go.rel.sets <- evalRelevance(ma.go.ranks, mala.go, d2d.map)
bpPlot(ma.kegg.rel.sets, what="rel.sets")

rseq.kegg.rel.sets <- evalRelevance(rseq.kegg.ranks, mala.kegg, d2d.tcga)
rseq.go.rel.sets <- evalRelevance(rseq.go.ranks, mala.go, d2d.tcga)
facetplot(ma.kegg.rel.sets, ma.go.rel.sets, rseq.kegg.rel.sets, 
    rseq.go.rel.sets, ylab="% optimal relevance score", vline=4.5)